AllLife Bank is a US bank that has a growing customer base. The majority of these customers are liability customers (depositors) with varying sizes of deposits. The number of customers who are also borrowers (asset customers) is quite small, and the bank is interested in expanding this base rapidly to bring in more loan business and in the process, earn more through the interest on loans. In particular, the management wants to explore ways of converting its liability customers to personal loan customers (while retaining them as depositors).
A campaign that the bank ran last year for liability customers showed a healthy conversion rate of over 9%. This has encouraged the retail marketing department to devise better-targeted campaigns to increase the success ratio.
As a Data Scientist at AllLife Bank, you have to build a model that will help the marketing department identify the potential customers who have a higher probability of purchasing the loan.
Objective: to predict whether a liability customer will buy a personal loan, to understand which customer attributes are most significant in driving purchases, and to identify which segments of customers to target more.
The detailed data dictionary is given below.
Data Dictionary
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Library to split data
from sklearn.model_selection import train_test_split
# To build model for prediction
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# To get different metric scores
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
make_scorer
)
from sklearn.model_selection import train_test_split,GridSearchCV
# To ignore unnecessary warnings
import warnings
warnings.filterwarnings("ignore")
# Read the raw CSV once; the master frame is kept as an untouched reference.
lm_data_master = pd.read_csv("/content/Loan_Modelling.csv")

# All further work happens on a copy so the master frame is never modified.
lm_data = lm_data_master.copy()

# First look at the data: leading rows, dimensions, dtypes and non-null counts.
lm_data.head()
lm_data.shape
lm_data.info()

# Data-quality checks: missing values per column, then fully duplicated rows.
lm_data.isna().sum()
lm_data.duplicated().sum()

# The ID column is removed before any analysis or modelling.
lm_data = lm_data.drop(columns=["ID"])
Let's check the statistical summary of the data.
# Statistical summary, transposed so that each column of lm_data becomes one row.
lm_data.describe().T
def histogram_boxplot(data, feature, figsize=(15, 10), kde=False, bins=None):
    """
    Boxplot and histogram combined, stacked vertically on a shared x-axis.

    The mean is drawn as a green dashed line and the median as a black solid
    line on the histogram. The figure is not shown here; the caller (or the
    notebook backend) is responsible for displaying it.

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (15,10))
    kde: whether to show the density curve (default False)
    bins: bins specification forwarded to the histogram; when None the
        argument is omitted so seaborn's own default applies (default None)
    """
    _fig, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # boxplot on top, histogram below
        sharex=True,  # both plots use the same x-axis
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )

    # showmeans=True marks the column mean on the boxplot.
    sns.boxplot(data=data, x=feature, ax=ax_box2, showmeans=True, color="violet")

    # Forward `bins` only when the caller supplied one. Testing against None
    # (rather than truthiness) also lets array-like bin edges through, which
    # would raise on a plain `if bins` check.
    hist_kwargs = {} if bins is None else {"bins": bins}
    sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, **hist_kwargs)

    # Reference lines for the mean and the median of the column.
    ax_hist2.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist2.axvline(data[feature].median(), color="black", linestyle="-")
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Count plot with the count (or percentage) annotated above every bar.

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """
    total = len(data[feature])  # number of rows, the denominator for percentages
    count = data[feature].nunique()

    # Figure width grows with the number of bars that will be drawn.
    bars_shown = count if n is None else n
    plt.figure(figsize=(bars_shown + 2, 6))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # value_counts() orders levels by frequency; [:n] keeps the top n (all if n is None)
        order=data[feature].value_counts().index[:n],
    )

    for p in ax.patches:
        height = p.get_height()
        # percentage of each class of the category, or the raw count
        label = f"{100 * height / total:.1f}%" if perc else height

        # anchor the label at the horizontal centre of the bar, at its top edge
        x = p.get_x() + p.get_width() / 2
        ax.annotate(
            label,
            (x, height),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),  # nudge the text 5 points above the bar
            textcoords="offset points",
        )

    plt.show()  # show the plot
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()

    # value_counts() sorts by descending frequency, so the last index entry is
    # the least frequent target class; both tables are ordered by that column.
    sorter = data[target].value_counts().index[-1]

    # Absolute counts (with row/column totals) are printed for reference.
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)

    # Row-normalised proportions drive the stacked bars.
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 5))

    # A single legend placed outside the axes. The earlier "lower left" legend
    # call was removed because this call replaced it before anything was drawn.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
### function to plot distributions wrt target
def distribution_plot_wrt_target(data, predictor, target):
    """
    Draw a 2x2 grid comparing one predictor across the first two target values.

    Top row: density histogram (with KDE) of the predictor for each of the
    first two unique target values. Bottom row: boxplots of the predictor
    grouped by target, with and without outliers.

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    fig, axs = plt.subplots(2, 2, figsize=(12, 10))
    target_uniq = data[target].unique()

    # Top row: one histogram per target value, in the order unique() returns them.
    for position, shade in enumerate(("teal", "orange")):
        level = target_uniq[position]
        panel = axs[0, position]
        panel.set_title("Distribution of target for target=" + str(level))
        sns.histplot(
            data=data[data[target] == level],
            x=predictor,
            kde=True,
            ax=panel,
            color=shade,
            stat="density",
        )

    # Bottom row: the same boxplot twice, the second one hiding the fliers.
    box_options = dict(data=data, x=target, y=predictor, palette="gist_rainbow")

    axs[1, 0].set_title("Boxplot w.r.t target")
    sns.boxplot(ax=axs[1, 0], **box_options)

    axs[1, 1].set_title("Boxplot (without outliers) w.r.t target")
    sns.boxplot(ax=axs[1, 1], showfliers=False, **box_options)

    plt.tight_layout()
    plt.show()
# Histogram + boxplot for Experience, Age and Income.
for column in ("Experience", "Age", "Income"):
    histogram_boxplot(lm_data, column)

# Family: bar chart of the level counts, then histogram + boxplot.
labeled_barplot(lm_data, "Family")
histogram_boxplot(lm_data, "Family")

histogram_boxplot(lm_data, "CCAvg")

# Education: bar chart of the level counts, then histogram + boxplot.
labeled_barplot(lm_data, "Education")
histogram_boxplot(lm_data, "Education")

# Mortgage: plain boxplot only.
sns.boxplot(x="Mortgage", data=lm_data)
plt.show()

# Bar charts for the target and the remaining flag-like columns.
for column in (
    "Personal_Loan",
    "Securities_Account",
    "CD_Account",
    "Online",
    "CreditCard",
):
    labeled_barplot(lm_data, column)

# Pairwise scatter plots across all columns.
sns.pairplot(lm_data)

# Correlation heatmap restricted to the numeric columns.
cols_list = lm_data.select_dtypes(include=np.number).columns.tolist()
plt.figure(figsize=(12, 7))
correlations = lm_data[cols_list].corr()
sns.heatmap(correlations, annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()

# Start of the bivariate analysis against the target.
distribution_plot_wrt_target(lm_data, "Age", "Personal_Loan")
All the customers are between 35 and 55 years of age, and the ages seem to be equally distributed.
# Compare every remaining predictor against the Personal_Loan target,
# one 2x2 figure per predictor, in the order listed here.
for predictor_name in (
    "Experience",
    "Income",
    "Family",
    "CCAvg",
    "Education",
    "Mortgage",
    "Securities_Account",
    "CD_Account",
    "Online",
    "CreditCard",
):
    distribution_plot_wrt_target(lm_data, predictor_name, "Personal_Loan")
# Count how many rows have a negative value in the Experience column.
(lm_data["Experience"] < 0).sum()

# Convert negative values to positive values by taking the magnitude
# (presumably the negatives are sign errors -- confirm with the data owner).
# Series.abs() is the vectorised equivalent of negating each negative entry.
lm_data["Experience"] = lm_data["Experience"].abs()

# Dropping zip code column
lm_data.drop("ZIPCode", axis=1, inplace=True)
# Separate the predictors from the Personal_Loan target.
Y = lm_data["Personal_Loan"]
X = lm_data.drop(columns=["Personal_Loan"])

# One-hot encode (dropping the first level of each encoded column) and cast
# the whole matrix to float.
X = pd.get_dummies(X, drop_first=True).astype(float)

# Splitting data in train and test sets: 70% train / 30% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1
)

# Report the resulting shapes.
print("Shape of Training set : ", X_train.shape)
print("Shape of test set : ", X_test.shape)

# Report the class proportions in each partition.
train_class_share = y_train.value_counts(normalize=True)
test_class_share = y_test.value_counts(normalize=True)
print("Percentage of classes in training set:")
print(train_class_share)
print("Percentage of classes in test set:")
print(test_class_share)
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Score a fitted classifier and return the metrics as a one-row dataframe.

    model: classifier
    predictors: independent variables
    target: dependent variable

    Returns a dataframe with a single row (index 0) and the columns
    Accuracy, Recall, Precision and F1.
    """
    predicted = model.predict(predictors)

    # Column name -> scoring function; dict order fixes the column order.
    scorers = {
        "Accuracy": accuracy_score,
        "Recall": recall_score,
        "Precision": precision_score,
        "F1": f1_score,
    }
    scores = {label: scorer(target, predicted) for label, scorer in scorers.items()}

    return pd.DataFrame(scores, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages

    Each cell of the heatmap is annotated with its count and its share of all
    observations. The figure is not shown here; the caller (or the notebook
    backend) is responsible for displaying it.

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)

    # Total number of observations, computed once instead of once per cell.
    total = cm.sum()

    # Two-line annotation per cell: count, then percentage of all observations.
    # Reshaping to cm.shape (instead of a fixed 2x2) keeps the labels aligned
    # with the matrix whatever number of classes it contains.
    labels = np.asarray(
        ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total) for item in cm.flatten()]
    ).reshape(cm.shape)

    plt.figure(figsize=(6, 4))
    # fmt="" because the annotations are already formatted strings
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# Baseline: a decision tree with default settings apart from the fixed seed
# (fit() returns the estimator itself, so construction and fitting are chained).
model0 = DecisionTreeClassifier(criterion="gini", random_state=1).fit(X_train, y_train)

# Training-set confusion matrix and metrics.
confusion_matrix_sklearn(model0, X_train, y_train)
decision_tree_default_perf_train = model_performance_classification_sklearn(
    model0, X_train, y_train
)
decision_tree_default_perf_train

# Test-set confusion matrix and metrics.
confusion_matrix_sklearn(model0, X_test, y_test)
decision_tree_default_perf_test = model_performance_classification_sklearn(
    model0, X_test, y_test
)
decision_tree_default_perf_test
# Define the parameters of the tree to iterate over
max_depth_values = np.arange(2, 7, 2)
max_leaf_nodes_values = [50, 75, 150, 250]
min_samples_split_values = [10, 30, 50, 70]

# Initialize variables to store the best model and its performance
best_estimator = None
best_score_diff = float("inf")
best_test_score = 0.0

# NOTE(review): candidates are compared on the test set, so the test metrics
# reported for the selected model are optimistic; a separate validation split
# or cross-validation would avoid that.

# Iterate over all combinations of the specified parameter values
for max_depth in max_depth_values:
    for max_leaf_nodes in max_leaf_nodes_values:
        for min_samples_split in min_samples_split_values:
            # Initialize the tree with the current set of parameters
            estimator = DecisionTreeClassifier(
                max_depth=max_depth,
                max_leaf_nodes=max_leaf_nodes,
                min_samples_split=min_samples_split,
                class_weight="balanced",
                random_state=42,
            )

            # Fit the model to the training data
            estimator.fit(X_train, y_train)

            # Make predictions on the training and test sets
            y_train_pred = estimator.predict(X_train)
            y_test_pred = estimator.predict(X_test)

            # Calculate recall scores for training and test sets
            train_recall_score = recall_score(y_train, y_train_pred)
            test_recall_score = recall_score(y_test, y_test_pred)

            # Absolute train/test recall gap, used as a measure of overfitting
            score_diff = abs(train_recall_score - test_recall_score)

            # A candidate replaces the incumbent only when it has BOTH a smaller
            # train/test gap and a higher test recall.
            if score_diff < best_score_diff and test_recall_score > best_test_score:
                best_score_diff = score_diff
                best_test_score = test_recall_score
                best_estimator = estimator

# Fail with a clear message if no candidate was ever selected.
if best_estimator is None:
    raise RuntimeError("No parameter combination achieved a test recall above 0.")

# Print the best parameters
print("Best parameters found:")
print(f"Max depth: {best_estimator.max_depth}")
print(f"Max leaf nodes: {best_estimator.max_leaf_nodes}")
print(f"Min samples split: {best_estimator.min_samples_split}")
print(f"Best test recall score: {best_test_score}")

# Use the selected estimator. `estimator` is merely the last combination the
# grid visited, not the best one, so it must not be used here. The selected
# tree was already fitted on the training data inside the loop.
model2 = best_estimator

# Training-set confusion matrix and metrics for the pre-pruned tree.
confusion_matrix_sklearn(model2, X_train, y_train)
decision_tree_tune_perf_train = model_performance_classification_sklearn(
    model2, X_train, y_train
)
decision_tree_tune_perf_train

# Test-set confusion matrix and metrics for the pre-pruned tree.
confusion_matrix_sklearn(model2, X_test, y_test)
decision_tree_tune_perf_test = model_performance_classification_sklearn(
    model2, X_test, y_test
)
decision_tree_tune_perf_test
# Column names of the training matrix, used to label tree nodes and bars.
feature_names = list(X_train.columns)

# Draw the pre-pruned tree.
plt.figure(figsize=(20, 10))
out = tree.plot_tree(
    model2,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# below code will add arrows to the decision tree split if they are missing
for node_annotation in out:
    connector = node_annotation.arrow_patch
    if connector is None:
        continue
    connector.set_edgecolor("black")
    connector.set_linewidth(1)
plt.show()

# Text dump of the same tree, including the class weights at each node.
print(tree.export_text(model2, feature_names=feature_names, show_weights=True))

# Horizontal bar chart of feature importances, sorted ascending so the most
# important feature ends up at the top.
importances = model2.feature_importances_
indices = np.argsort(importances)
bar_positions = range(len(indices))
sorted_names = [feature_names[i] for i in indices]

plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(bar_positions, importances[indices], color="violet", align="center")
plt.yticks(bar_positions, sorted_names)
plt.xlabel("Relative Importance")
plt.show()
# Compute the cost-complexity pruning path of an unrestricted tree on the
# training data.
clf = DecisionTreeClassifier(random_state=1)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

# Tabular view of the path.
pd.DataFrame(path)

# Total leaf impurity against effective alpha; the last point of the path is
# left out of the plot.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set(
    xlabel="effective alpha",
    ylabel="total impurity of leaves",
    title="Total Impurity vs effective alpha for training set",
)
plt.show()
# Train one decision tree per effective alpha. The last value in ccp_alphas is
# the alpha value that prunes the whole tree, leaving the tree, clfs[-1], with
# one node.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=1, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)

print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]
    )
)

# Drop that final tree (and its alpha) from the lists used by the plots and
# the model selection that follow.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# Size of every pruned tree: number of nodes and maximum depth.
node_counts = [fitted.tree_.node_count for fitted in clfs]
depth = [fitted.tree_.max_depth for fitted in clfs]

# Two stacked step plots against alpha: node count on top, depth below.
fig, ax = plt.subplots(2, 1, figsize=(10, 7))
panels = (
    (ax[0], node_counts, "number of nodes", "Number of nodes vs alpha"),
    (ax[1], depth, "depth of tree", "Depth vs alpha"),
)
for axis, series, y_caption, heading in panels:
    axis.plot(ccp_alphas, series, marker="o", drawstyle="steps-post")
    axis.set_xlabel("alpha")
    axis.set_ylabel(y_caption)
    axis.set_title(heading)
fig.tight_layout()
# Recall of every pruned tree on the training and on the test data.
recall_train = [recall_score(y_train, fitted.predict(X_train)) for fitted in clfs]
recall_test = [recall_score(y_test, fitted.predict(X_test)) for fitted in clfs]

# Step plot of both recall curves against alpha.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("alpha")
ax.set_ylabel("Recall")
ax.set_title("Recall vs alpha for training and testing sets")
for series, series_label in ((recall_train, "train"), (recall_test, "test")):
    ax.plot(ccp_alphas, series, marker="o", label=series_label, drawstyle="steps-post")
ax.legend()
plt.show()
# Select the pruned tree with the highest test recall. np.argmax returns the
# first maximum, so ties go to the earliest entry in clfs.
index_best_model = np.argmax(recall_test)
best_model = clfs[index_best_model]
print(best_model)

model3 = best_model

# Training-set confusion matrix and metrics for the post-pruned tree.
confusion_matrix_sklearn(model3, X_train, y_train)
decision_tree_postpruned_perf_train = model_performance_classification_sklearn(
    model3, X_train, y_train
)
decision_tree_postpruned_perf_train

# Test-set confusion matrix and metrics for the post-pruned tree. This matrix
# was previously drawn on the training data a second time.
confusion_matrix_sklearn(model3, X_test, y_test)
decision_tree_postpruned_perf_test = model_performance_classification_sklearn(
    model3, X_test, y_test
)
decision_tree_postpruned_perf_test
# Draw the post-pruned tree.
plt.figure(figsize=(20, 10))
out = tree.plot_tree(
    model3,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# below code will add arrows to the decision tree split if they are missing
for node_annotation in out:
    connector = node_annotation.arrow_patch
    if connector is None:
        continue
    connector.set_edgecolor("black")
    connector.set_linewidth(1)
plt.show()

# Text dump of the same tree, including the class weights at each node.
print(tree.export_text(model3, feature_names=feature_names, show_weights=True))

# Feature importances as a table, most important first.
importance_table = pd.DataFrame(
    model3.feature_importances_, columns=["Imp"], index=X_train.columns
)
print(importance_table.sort_values(by="Imp", ascending=False))

# The same importances as a horizontal bar chart, sorted ascending so the most
# important feature ends up at the top.
importances = model3.feature_importances_
indices = np.argsort(importances)
bar_positions = range(len(indices))
sorted_names = [feature_names[i] for i in indices]

plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(bar_positions, importances[indices], color="violet", align="center")
plt.yticks(bar_positions, sorted_names)
plt.xlabel("Relative Importance")
plt.show()
# training performance comparison
# Stack the three one-row metric frames, then transpose so that metrics are
# rows and models are columns.
train_metric_frames = [
    decision_tree_default_perf_train,
    decision_tree_tune_perf_train,
    decision_tree_postpruned_perf_train,
]
models_train_comp_df = pd.concat(train_metric_frames, axis=0).T
models_train_comp_df.columns = [
    "Decision Tree (sklearn default)",
    "Decision Tree (Pre-Pruning)",
    "Decision Tree (Post-Pruning)",
]
print("Training performance comparison:")
models_train_comp_df
# test performance comparison
# Stored under its own name so the training comparison frame is not
# overwritten by the test metrics.
models_test_comp_df = pd.concat(
    [
        decision_tree_default_perf_test.T,
        decision_tree_tune_perf_test.T,
        decision_tree_postpruned_perf_test.T,
    ],
    axis=1,
)
models_test_comp_df.columns = [
    "Decision Tree (sklearn default)",
    "Decision Tree (Pre-Pruning)",
    "Decision Tree (Post-Pruning)",
]
print("Test set performance comparison:")
models_test_comp_df
After post-pruning, the decision tree model performs well on the training set, with an accuracy of 0.99, precision of 0.96, recall of 0.95 and F1 score of 0.96.
After post-pruning, the decision tree model also performs well on the test set, with an accuracy of 0.98, precision of 0.90, recall of 0.93 and F1 score of 0.91.